import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc
import seaborn as sns
%matplotlib inline
# Check the versions of the core libraries
for lib_name, lib in (("Pandas", pd), ("Numpy", np), ("Scipy", sc)):
    print(f"{lib_name} ver. {lib.__version__}")
Pandas ver. 2.0.3 Numpy ver. 1.24.3 Scipy ver. 1.11.1
# Load the survey data from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='Invistico_Airline.csv')
# Show the first records
data.head(n=5)
| satisfaction | Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Seat comfort | Departure/Arrival time convenient | Food and drink | ... | Online support | Ease of Online booking | On-board service | Leg room service | Baggage handling | Checkin service | Cleanliness | Online boarding | Departure Delay in Minutes | Arrival Delay in Minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | satisfied | Female | Loyal Customer | 65 | Personal Travel | Eco | 265 | 0 | 0 | 0 | ... | 2 | 3 | 3 | 0 | 3 | 5 | 3 | 2 | 0 | 0.0 |
| 1 | satisfied | Male | Loyal Customer | 47 | Personal Travel | Business | 2464 | 0 | 0 | 0 | ... | 2 | 3 | 4 | 4 | 4 | 2 | 3 | 2 | 310 | 305.0 |
| 2 | satisfied | Female | Loyal Customer | 15 | Personal Travel | Eco | 2138 | 0 | 0 | 0 | ... | 2 | 2 | 3 | 3 | 4 | 4 | 4 | 2 | 0 | 0.0 |
| 3 | satisfied | Female | Loyal Customer | 60 | Personal Travel | Eco | 623 | 0 | 0 | 0 | ... | 3 | 1 | 1 | 0 | 1 | 4 | 1 | 3 | 0 | 0.0 |
| 4 | satisfied | Female | Loyal Customer | 70 | Personal Travel | Eco | 354 | 0 | 0 | 0 | ... | 4 | 2 | 2 | 0 | 2 | 4 | 2 | 5 | 0 | 0.0 |
5 rows × 23 columns
# Show the number of records (rows) in the dataset
len(data)
129880
# Show the number of attributes (columns) in the dataset
num_atributs = data.shape[1]
num_atributs
23
# Print the name of every attribute of the dataset, one per line
for column_name in data.columns.tolist():
    print(column_name)
satisfaction Gender Customer Type Age Type of Travel Class Flight Distance Seat comfort Departure/Arrival time convenient Food and drink Gate location Inflight wifi service Inflight entertainment Online support Ease of Online booking On-board service Leg room service Baggage handling Checkin service Cleanliness Online boarding Departure Delay in Minutes Arrival Delay in Minutes
# Count the missing (null) values in each column
missing_values = data.isna().sum()
print(missing_values)
satisfaction 0 Gender 0 Customer Type 0 Age 0 Type of Travel 0 Class 0 Flight Distance 0 Seat comfort 0 Departure/Arrival time convenient 0 Food and drink 0 Gate location 0 Inflight wifi service 0 Inflight entertainment 0 Online support 0 Ease of Online booking 0 On-board service 0 Leg room service 0 Baggage handling 0 Checkin service 0 Cleanliness 0 Online boarding 0 Departure Delay in Minutes 0 Arrival Delay in Minutes 393 dtype: int64
# Drop the rows whose "Arrival Delay in Minutes" value is missing.
# .copy() makes satAir an independent DataFrame, so adding columns to it
# later does not trigger SettingWithCopyWarning.
satAir = data.dropna(subset=["Arrival Delay in Minutes"]).copy()
# Recount the missing values per column to confirm none remain
missing_values = satAir.isnull().sum()
print(missing_values)
satisfaction 0 Gender 0 Customer Type 0 Age 0 Type of Travel 0 Class 0 Flight Distance 0 Seat comfort 0 Departure/Arrival time convenient 0 Food and drink 0 Gate location 0 Inflight wifi service 0 Inflight entertainment 0 Online support 0 Ease of Online booking 0 On-board service 0 Leg room service 0 Baggage handling 0 Checkin service 0 Cleanliness 0 Online boarding 0 Departure Delay in Minutes 0 Arrival Delay in Minutes 0 dtype: int64
# Frequency of each satisfaction level, printed and drawn as a donut chart
freq = satAir["satisfaction"].value_counts()
print(freq)
plt.pie(
    x=freq,
    labels=freq.index,
    autopct='%1.1f%%',
    startangle=90,
    wedgeprops={'width': 0.4},
)
plt.title("Satisfaction")
plt.show()
satisfaction satisfied 70882 dissatisfied 58605 Name: count, dtype: int64
from pywaffle import Waffle
# Share of each gender among the passengers, in percent
gender_counts = satAir['Gender'].value_counts()
gender_percentage = (gender_counts / len(satAir)) * 100
# value_counts() orders its result by frequency, while the labels, colors and
# icons below are listed as Female, Male. Select the values in that same
# explicit order so every block always gets its matching label, color and icon.
gender_order = ['Female', 'Male']
gender_percentage = gender_percentage.loc[gender_order]
# Create the waffle chart
fig = plt.figure(
    FigureClass=Waffle,
    rows=5,  # rows of people icons
    figsize=(11, 6),
    values=gender_percentage,
    labels=[
        f"{gender} ({gender_percentage[gender]:.2f}%)"
        for gender in gender_order
    ],
    colors=["#FF82AB", "#1E90FE"],
    icons=['female', 'male'],
    legend={
        'loc': 'lower center',
        'bbox_to_anchor': (0.5, -0.5),
        'ncol': len(gender_counts),
        'framealpha': 0,
        'fontsize': 20,
    },
    icon_size=30,
    icon_legend=True,
    title={
        'label': 'Distribució segons el gènere',
        'loc': 'center',
        'fontdict': {'fontsize': 20},
    },
)
plt.show()
def calculate_percentage_cross_tab_with_style(df, x):
# Creem la tabulació creuada
cross_tab = pd.crosstab(df[x], df['satisfaction'])
# Convertim els counts en percentages i ajustem els decimals
percentage_cross_tab = cross_tab.apply(lambda row: row / row.sum() * 100, axis=1)
rounded_percentage_cross_tab = percentage_cross_tab.round(2)
# Apliquem .style.background_gradient
styled_percentage_cross_tab = rounded_percentage_cross_tab.style.background_gradient(cmap='Blues')
return styled_percentage_cross_tab
# Row-wise satisfaction percentages for each gender
calculate_percentage_cross_tab_with_style(df=satAir, x='Gender')
| satisfaction | dissatisfied | satisfied |
|---|---|---|
| Gender | ||
| Female | 34.860000 | 65.140000 |
| Male | 55.970000 | 44.030000 |
import plotly.express as px
# Bucket passengers into age groups. Each bin edge is the upper bound named in
# the matching label, so the intervals must be closed on the right
# ((0, 20], (20, 30], ...) for the labels to be truthful; with right=False a
# 20-year-old was filed under "21-30". include_lowest=True keeps an age of
# exactly 0 inside the first bucket.
age_bins = [0, 20, 30, 40, 50, 60, 100]
age_labels = ["0-20", "21-30", "31-40", "41-50", "51-60", "61+"]
# assign() returns a new DataFrame instead of writing into satAir in place,
# which avoids a SettingWithCopyWarning when satAir was derived from another
# frame.
satAir = satAir.assign(**{
    'Age Group': pd.cut(
        satAir['Age'],
        bins=age_bins,
        labels=age_labels,
        right=True,
        include_lowest=True,
    )
})
# Number of passengers per age group
age_group_counts = satAir['Age Group'].value_counts().reset_index()
age_group_counts.columns = ['Age Group', 'Count']
age_group_counts.style.background_gradient(cmap='Blues')
| Age Group | Count | |
|---|---|---|
| 0 | 41-50 | 29555 |
| 1 | 21-30 | 26098 |
| 2 | 31-40 | 25623 |
| 3 | 51-60 | 23902 |
| 4 | 61+ | 12168 |
| 5 | 0-20 | 12141 |
# Bar chart of the passenger count per age group, using the reversed Blues scale
reversed_blues = px.colors.sequential.Blues[::-1]
fig = px.bar(
    data_frame=age_group_counts,
    x='Age Group',
    y='Count',
    color='Age Group',
    color_discrete_sequence=reversed_blues,
    labels={'Count': 'Number of Customers'},
    title='Grups edat',
)
# Axis titles, fonts, background and margins
layout_options = {
    'xaxis_title': 'Grup edat',
    'yaxis_title': 'Nombre de clients',
    'font': {'size': 12},
    'title_font': {'size': 16},
    'showlegend': False,
    'plot_bgcolor': '#FFFFFF',
    'margin': {'l': 40, 'r': 40, 't': 80, 'b': 40},
}
fig.update_layout(**layout_options)
fig.show()
# Row-wise satisfaction percentages for each age group
calculate_percentage_cross_tab_with_style(satAir, 'Age Group')
| satisfaction | dissatisfied | satisfied |
|---|---|---|
| Age Group | ||
| 0-20 | 56.680000 | 43.320000 |
| 21-30 | 55.490000 | 44.510000 |
| 31-40 | 49.950000 | 50.050000 |
| 41-50 | 34.270000 | 65.730000 |
| 51-60 | 33.300000 | 66.700000 |
| 61+ | 52.230000 | 47.770000 |
# Share (as a 0-1 fraction) of each satisfaction level within every
# gender / age-group combination, one column per satisfaction level
satisfaction_by_gender_age = (
    satAir
    .groupby(['Gender', 'Age Group'])['satisfaction']
    .value_counts(normalize=True)
    .unstack()
)
satisfaction_by_gender_age.style.background_gradient(cmap='Blues')
| satisfaction | dissatisfied | satisfied | |
|---|---|---|---|
| Gender | Age Group | ||
| Female | 0-20 | 0.323343 | 0.676657 |
| 21-30 | 0.485693 | 0.514307 | |
| 31-40 | 0.419186 | 0.580814 | |
| 41-50 | 0.269948 | 0.730052 | |
| 51-60 | 0.241408 | 0.758592 | |
| 61+ | 0.321965 | 0.678035 | |
| Male | 0-20 | 0.815895 | 0.184105 |
| 21-30 | 0.630902 | 0.369098 | |
| 31-40 | 0.580095 | 0.419905 | |
| 41-50 | 0.417214 | 0.582786 | |
| 51-60 | 0.425130 | 0.574870 | |
| 61+ | 0.725497 | 0.274503 |
# Age histogram split by gender, with a box plot in the margin.
# A positional color sequence is handed out in the order the genders are
# encountered, so which gender got which color depended on the data. Pin the
# colors per gender instead, matching the waffle chart's convention
# (Female pink, Male blue).
fig = px.histogram(
    satAir,
    x='Age',
    color='Gender',
    marginal='box',
    title='Distribució edat per gènere',
    labels={'Age': 'Age'},
    color_discrete_map={'Male': '#66B2FF', 'Female': '#FF69B4'},
)
fig.show()
# Frequency of each customer type, printed and drawn as a donut chart
freq = satAir["Customer Type"].value_counts()
print(freq)
plt.pie(
    x=freq,
    labels=freq.index,
    autopct='%1.1f%%',
    startangle=90,
    wedgeprops={'width': 0.4},
)
plt.title("Customer Type")
plt.show()
Customer Type Loyal Customer 105773 disloyal Customer 23714 Name: count, dtype: int64
# Satisfaction counts, one panel per customer type
g = sns.catplot(
    data=satAir,
    x="satisfaction",
    col="Customer Type",
    col_wrap=2,
    kind="count",
)
# Mean passenger age for each customer type
average_age_by_customer_type = (
    satAir.groupby('Customer Type', as_index=False)['Age'].mean()
)
average_age_by_customer_type.style.background_gradient(cmap='Blues')
| Customer Type | Age | |
|---|---|---|
| 0 | Loyal Customer | 41.463625 |
| 1 | disloyal Customer | 30.352534 |
# Box plot of the age distribution for each customer type
customer_type_palette = ['#98F5FF', '#08306B']
fig = px.box(
    data_frame=satAir,
    x='Customer Type',
    y='Age',
    color='Customer Type',
    color_discrete_sequence=customer_type_palette,
    labels={'Age': 'Age', 'Customer Type': 'Customer Type'},
    title='Distribució Edat per Customer Type',
)
fig.show()
# Frequency of each type of travel, printed and drawn as a donut chart
freq = satAir["Type of Travel"].value_counts()
print(freq)
plt.pie(
    x=freq,
    labels=freq.index,
    autopct='%1.1f%%',
    startangle=90,
    wedgeprops={'width': 0.4},
)
plt.title("Type of Travel")
plt.show()
Type of Travel Business travel 89445 Personal Travel 40042 Name: count, dtype: int64
# Row-wise satisfaction percentages for each type of travel
calculate_percentage_cross_tab_with_style(df=satAir, x='Type of Travel')
| satisfaction | dissatisfied | satisfied |
|---|---|---|
| Type of Travel | ||
| Business travel | 41.630000 | 58.370000 |
| Personal Travel | 53.360000 | 46.640000 |
# Frequency of each travel class, printed and drawn as a donut chart
freq = satAir["Class"].value_counts()
print(freq)
plt.pie(
    x=freq,
    labels=freq.index,
    autopct='%1.1f%%',
    startangle=90,
    wedgeprops={'width': 0.4},
)
plt.title("Class")
plt.show()
Class Business 61990 Eco 58117 Eco Plus 9380 Name: count, dtype: int64
# Row-wise satisfaction percentages for each travel class
calculate_percentage_cross_tab_with_style(df=satAir, x='Class')
| satisfaction | dissatisfied | satisfied |
|---|---|---|
| Class | ||
| Business | 29.060000 | 70.940000 |
| Eco | 60.600000 | 39.400000 |
| Eco Plus | 57.280000 | 42.720000 |
import plotly.express as px
# Histogram of passenger ages
age_range_fig = px.histogram(
    data_frame=satAir,
    x="Age",
    title="Distribució de les edats",
)
age_range_fig.update_traces(marker={'color': 'skyblue'})
# Centered title, axis titles, bar spacing, background and font
age_layout = {
    'title': {
        'text': "Distribució de les edats",
        'x': 0.5,
        'y': 0.95,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    'xaxis': {'title': "Age"},
    'yaxis': {'title': "Count"},
    'showlegend': False,
    'bargap': 0.1,
    'plot_bgcolor': 'white',
    'font': {'family': "Arial", 'size': 12},
}
age_range_fig.update_layout(**age_layout)
age_range_fig.show()
# Two side-by-side panels for the flight distance
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
# Flatten the axes array to one dimension
axes = axes.ravel()
hist_ax, kde_ax = axes
# Left: histogram of the flight distance
sns.histplot(data=satAir, x='Flight Distance', ax=hist_ax)
# Right: stacked density estimate of the flight distance per satisfaction level
sns.kdeplot(
    data=satAir,
    x="Flight Distance",
    hue="satisfaction",
    multiple="stack",
    ax=kde_ax,
)
<Axes: xlabel='Flight Distance', ylabel='Density'>
# Box plot of the flight distance for each travel class
fig = px.box(
    data_frame=satAir,
    x='Class',
    y='Flight Distance',
    color='Class',
    labels={'Flight Distance': 'Distance'},
    title='Distribució de la distància de vol per classe',
)
fig.show()
# Bar chart of Flight Distance per type of travel, split by satisfaction,
# one panel per travel class
g = sns.catplot(
    data=satAir,
    x="Flight Distance",
    y="Type of Travel",
    hue="satisfaction",
    col="Class",
    kind="bar",
    height=4.5,
    aspect=.8,
)
# Mean passenger age for each type of travel
average_age_by_purpose = satAir.groupby('Type of Travel')['Age'].mean().reset_index()
# `labels` must map column names to display text; the original passed a set
# literal ({'Age', 'Type of Travel'}), which is not a mapping. The identity
# mapping below keeps the axis and legend text equal to the column names.
fig = px.bar(
    average_age_by_purpose,
    x='Type of Travel',
    y='Age',
    title='Edat mitjana segons el propòsit del viatge',
    labels={'Age': 'Age', 'Type of Travel': 'Type of Travel'},
    color='Type of Travel',
    color_discrete_sequence=['#66B2FF', '#90EE90'],
)
fig.show()
# Scatter plot of arrival delay against departure delay, colored by satisfaction
delay_axis_labels = {
    'Departure Delay in Minutes': 'Departure Delay',
    'Arrival Delay in Minutes': 'Arrival Delay',
}
fig = px.scatter(
    data_frame=satAir,
    x='Departure Delay in Minutes',
    y='Arrival Delay in Minutes',
    color='satisfaction',
    color_discrete_sequence=['#90EE90', '#66B2FF'],
    labels=delay_axis_labels,
    title='Satisfacció de Departure Delay i Arrival Delay',
)
fig.show()
# One bar chart per delay column: delay by class, split by satisfaction,
# with one panel per type of travel
for delay_column in ("Departure Delay in Minutes", "Arrival Delay in Minutes"):
    g = sns.catplot(
        data=satAir,
        x="Class",
        y=delay_column,
        hue="satisfaction",
        col="Type of Travel",
        kind="bar",
    )
# Service-rating columns to plot
num_vars = [
    "Seat comfort",
    "Departure/Arrival time convenient",
    "Food and drink",
    "Gate location",
    "Inflight wifi service",
    "Inflight entertainment",
    "Online support",
    "Ease of Online booking",
    "On-board service",
    "Leg room service",
    "Baggage handling",
    "Checkin service",
    "Cleanliness",
    "Online boarding",
]
# 7 x 2 grid: one histogram per service column, filled row by row
fig, ax = plt.subplots(nrows=7, ncols=2, figsize=(15, 50))
for panel, num_var in zip(ax.flat, num_vars):
    panel.hist(satAir[num_var], bins=15)
    panel.set_title(num_var)
def create_grouped_bar_chart(x, y, df, color1, color2):
    """Show a grouped bar chart of the percentage of each ``y`` value per ``x``.

    For every value of ``df[x]`` the rows are split by ``df[y]`` and each
    group's size is expressed as a percentage of that ``x`` value's row count.

    Args:
        x: Column name used for the horizontal axis.
        y: Column name used to color and group the bars.
        df: DataFrame containing both columns.
        color1: First color of the discrete color sequence.
        color2: Second color of the discrete color sequence.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Rows per (x, y) pair, and rows per x value
    group_sizes = df.groupby([x, y]).size()
    totals = df.groupby([x]).size()
    # Scale to percent before rounding: rounding the 0-1 fraction first and
    # multiplying afterwards reintroduces float noise (e.g. 12.340000000000002).
    # Only the percentage is rounded, so the x and y columns are left untouched.
    satisfaction_percentage = (
        (group_sizes / totals * 100)
        .round(2)
        .reset_index(name='Percentage')
    )
    # Create the grouped bar chart
    fig = px.bar(
        satisfaction_percentage,
        x=x,
        y='Percentage',
        color=y,
        barmode='group',
        title=f'{x} vs {y}',
        labels={'Percentage': 'Percentage of Customers'},
        color_discrete_sequence=[color1, color2],
    )
    fig.show()
# Service-rating columns to compare against satisfaction
Services = [
    "Seat comfort",
    "Departure/Arrival time convenient",
    "Food and drink",
    "Gate location",
    "Inflight wifi service",
    "Inflight entertainment",
    "Online support",
    "Ease of Online booking",
    "On-board service",
    "Leg room service",
    "Baggage handling",
    "Checkin service",
    "Cleanliness",
    "Online boarding",
]
dissatisfied_color = '#90EE90'
satisfied_color = '#66B2FF'
# NOTE(review): satisfied_color is passed as color1 and dissatisfied_color as
# color2; confirm which satisfaction level each color ends up on in the charts.
for service in Services:
    create_grouped_bar_chart(service, 'satisfaction', satAir, satisfied_color, dissatisfied_color)
# Encode the target column as integers: dissatisfied -> 0, satisfied -> 1
satisfaction = {"satisfaction": dict(dissatisfied=0, satisfied=1)}
satAir = satAir.replace(to_replace=satisfaction)
# Count the rows for each encoded value
satAir['satisfaction'].value_counts()
satisfaction 1 70882 0 58605 Name: count, dtype: int64
# Correlation matrix of the remaining columns once the categorical ones are dropped
categorical_columns = ["Gender", "Customer Type", "Type of Travel", "Class", "Age Group"]
corr_all = satAir.drop(columns=categorical_columns).corr()
# Annotated heatmap of the correlations, with the color scale centered on 0
plt.figure(figsize=(25, 25))
cmap = sns.diverging_palette(150, 1, as_cmap=True)
sns.heatmap(
    data=corr_all,
    cmap=cmap,
    vmax=None,
    center=0,
    square=True,
    annot=True,
    linewidths=.5,
    cbar_kws={"shrink": .9},
)
plt.show()